library(RColorBrewer)
library(scales)
library(ISLR)
library(MASS)
library(mvtnorm)
cols <- brewer.pal(9, "Set1")


Logistic Regression

Simple example

Let us generate some synthetic data: simulating from a known model is a lazy but effective way of verifying that an implementation is correct.

set.seed(1234)
n <- 500
X <- rnorm(n, 3, 5)
eta <- 2 - 0.3 * X

We will also need a function that computes the inverse logit (logistic) transformation \[F(\eta) = \dfrac{1}{1 + e^{-\eta}}\]

logit <- function(x){
  # inverse logit (sigmoid): maps the linear predictor to a probability in (0, 1)
  return (1/(1 + exp(-x)))
}

We can then generate the responses from the implied probabilities.

Y <- array(NA, dim = n)
for (i in 1:n){
  p_i <- logit(eta[i])                          # success probability for observation i
  Y[i] <- sample(0:1, 1, TRUE, c(1 - p_i, p_i)) # draw Y_i ~ Bernoulli(p_i)
}
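
Equivalently, since the responses are independent Bernoulli draws, the loop can be replaced by a single vectorized call (a minimal alternative sketch; Y_vec is just an illustrative name).

# Vectorized alternative: draw all Bernoulli responses in one call
Y_vec <- rbinom(n, size = 1, prob = logit(eta))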

par(mar=c(4,4,2,2), family = 'serif')
plot(X, jitter(rep(0, n)), pch = 16, col = alpha(cols[Y + 1], 0.6), 
     xlab = 'X', ylab = '', ylim = c(0, 1))
legend('topright', legend = c('0','1'), lwd = 2, col = cols[1:2])

We now fit the logistic regression.

fit_glm <- glm(Y ~ X, family = 'binomial')
round(coef(fit_glm), 3)
(Intercept)           X 
      2.036      -0.286 

The coefficients are very close to the ground truth. Things seem to be working! Let us make predictions.

X_gr <- data.frame(X = seq(min(X), max(X), length.out = 100))
p_gr <- as.numeric(predict(fit_glm, newdata = X_gr, type = "response"))

par(mar=c(4,4,2,2), family = 'serif')
plot(X, jitter(rep(0, n)), pch = 16, col = alpha(cols[Y + 1], 0.6), 
     xlab = 'X', ylab = '', ylim = c(0, 1))
lines(X_gr$X, p_gr, lwd = 2)
legend('topright', legend = c('0','1','P(Y = 1)'), lwd = 2, col = c(cols[1:2], 'black'))

We can also add the horizontal line corresponding to probabilities equal to \(50\%\). This allows us to show the decision boundary, i.e. the point at which the prediction switches from one class to another.

boundary <- X_gr$X[which.min(abs(p_gr - 0.5))]
par(mar=c(4,4,2,2), family = 'serif')
plot(X, jitter(rep(0, n)), pch = 16, col = alpha(cols[Y + 1], 0.6), xlab = 'X', ylab = '', ylim = c(0, 1))
lines(X_gr$X, p_gr, lwd = 2)
abline(h = 0.5, lwd = 2, lty = 3)
abline(v = boundary, lwd = 2, lty = 3)
legend('topright', legend = c('0','1','P(Y = 1)'), lwd = 2, col = c(cols[1:2], 'black'))
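
For a single predictor the decision boundary is also available in closed form: \(\mathbb{P}(Y = 1 \mid X) = 0.5\) exactly when the linear predictor is zero, i.e. when \(X = -\beta_0 / \beta_1\). A quick sanity check against the grid-based value computed above:

# Closed-form boundary: solve beta_0 + beta_1 * X = 0
as.numeric(-coef(fit_glm)[1] / coef(fit_glm)[2])
boundary # grid-based approximation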

Default data

Let us load and visualize the Default data.

rm(list=ls())
cols <- brewer.pal(9, "Set1")

data("Default")
head(Default)
  default student   balance    income
1      No      No  729.5265 44361.625
2      No     Yes  817.1804 12106.135
3      No      No 1073.5492 31767.139
4      No      No  529.2506 35704.494
5      No      No  785.6559 38463.496
6      No     Yes  919.5885  7491.559
par(mar=c(4,4,2,2), family = 'serif')
plot(Default$balance, Default$income, pch = 16, 
     col = alpha(cols[Default$default], 0.6), 
     xlab = 'balance', ylab = 'income')

The class of people who do not default is far more prevalent (about \(97\%\) of the observations). It is hard to spot any pattern, since the red points cover the blue points. We can therefore subsample the red points and show a balanced dataset in which the proportion of defaults vs. non-defaults is \(50\%-50\%\).

set.seed(123)
# concatenate all defaults with an equally sized subsample of non-defaults
idx_resampl <- c(which(Default$default == 'Yes'),
                 sample(which(Default$default == 'No'),
                        sum(Default$default == 'Yes'), replace = FALSE))
Default_pl <- Default[idx_resampl,]

par(mar=c(4,4,2,2), family = 'serif')
plot(Default_pl$balance, Default_pl$income, pch = 16, col = alpha(cols[Default_pl$default], 0.6), 
     xlab = 'balance', ylab = 'income')

We can show the marginal distributions for the predictors used in the model, stratified by outcome variable.

par(mar=c(4,4,2,2), mfrow = c(1,2), family = 'serif')
boxplot(balance ~ default, data = Default, col = cols[1:2], pch = 16)
boxplot(income ~ default, data = Default, col = cols[1:2], pch = 16)

A first approach could be to use a simple linear model.

\[\mathbb{P}(y_i = 1 \mid x_i)=\textbf{x}_i^{\intercal} \mathbf{\beta}\]

This is called the linear probability model: the probability of a “yes” outcome (\(y = 1\)) is linear in \(\textbf{x}\).

Default$numeric_def <- as.numeric(Default$default) - 1

fit_lm <- lm(numeric_def ~ balance + income, data = Default)
summary(fit_lm)

Call:
lm(formula = numeric_def ~ balance + income, data = Default)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.24607 -0.06989 -0.02663  0.02005  0.98554 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -9.224e-02  5.788e-03 -15.936  < 2e-16 ***
balance      1.318e-04  3.514e-06  37.511  < 2e-16 ***
income       4.605e-07  1.274e-07   3.613 0.000304 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.168 on 9997 degrees of freedom
Multiple R-squared:  0.1237,    Adjusted R-squared:  0.1236 
F-statistic: 705.8 on 2 and 9997 DF,  p-value: < 2.2e-16

After we obtain these “probabilities”, we can classify the response as 1 if, for example, the fitted probability is larger than \(0.5\).

Y_hat <- as.numeric(predict(fit_lm, newdata = Default))
Y_hat <- ifelse(Y_hat > 0.5, 1, 0)
conf_mat <- table(pred = Y_hat, true = Default$numeric_def)
sum(diag(conf_mat))/sum(conf_mat) # in-sample accuracy
[1] 0.9667

The in-sample performance seems excellent! But how well are we really doing? Note that \(96.67\%\) of the training set does not default, so “not default” is the most likely outcome. A reasonable baseline, or “naive model”, is therefore one that guesses “not default” for every training set instance.

How well does this null model (or naive classifier) perform on the training set? Its accuracy is about \(96.67\%\), since it gets all the 0’s right and all the 1’s wrong.

table(Default$numeric_def)

   0    1 
9667  333 
max(table(Default$numeric_def))/sum(table(Default$numeric_def)) # in sample accuracy of naive classifier
[1] 0.9667

The mystery is solved: our linear probability model is always predicting \(0\), so it is no smarter than guessing “not default” for every instance.

The linear probability model has one obvious problem: it can produce fitted probabilities that fall outside \((0,1)\). This is simply wrong: you should not use a linear model to predict probabilities. Let us learn something more appropriate: logistic regression.

Now we have \[\mathbb{P}(y_i = 1 \mid \textbf{x}_i)=F(\textbf{x}_i^\intercal \mathbf{\beta}) = \dfrac{e^{\textbf{x}_i^\intercal \mathbf{\beta}}}{1 + e^{\textbf{x}_i^\intercal \mathbf{\beta}}} = \dfrac{1}{1 + e^{-\textbf{x}_i^\intercal \mathbf{\beta}}}.\]

fit_glm <- glm(numeric_def ~ balance + income, data = Default, family = 'binomial')
round(coef(fit_glm), 3)
(Intercept)     balance      income 
    -11.540       0.006       0.000 

Recall our model is

\[\text{odds}=\dfrac{p}{1-p} = e^{\beta_0} e^{\beta_1 x_1} \dots e^{\beta_p x_p}\]

So \(e^{\beta_j}\) is an odds multiplier, or odds ratio, for a one-unit increase in the feature \(x_j\), keeping all the others fixed.

The \(\beta\) for balance is \(0.006\). So an extra \(\$100\) of balance multiplies the odds of default by \(e^{0.6} \approx 1.82\), all other factors being equal.
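
We can check this directly from the unrounded coefficient; the result may differ slightly from the back-of-the-envelope \(e^{0.6} \approx 1.82\), which uses the rounded value \(0.006\).

# Odds multiplier for a $100 increase in balance, using the unrounded coefficient
exp(100 * coef(fit_glm)['balance'])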

Let us visualize the decision boundary.

balance_gr <- seq(min(Default$balance), max(Default$balance), length.out = 100)
income_gr <- seq(min(Default$income), max(Default$income), length.out = 100)

X_gr <- expand.grid(balance = balance_gr, income = income_gr)
p_gr <- as.numeric(predict(fit_glm, newdata = X_gr, type = "response"))

par(mar=c(4,4,2,2), family = 'serif')
plot(Default_pl$balance, Default_pl$income, pch = 16, col = alpha(cols[Default_pl$default], 0.6), xlab = 'balance', ylab = 'income')
contour(x = balance_gr, y = income_gr, 
        z = matrix(p_gr, length(balance_gr), length(income_gr), byrow = F),
        levels = 0.5, lwd = 2, add = TRUE)

Remember: the full dataset contains many more observations than the balanced subsample shown above!

par(mar=c(4,4,2,2), family = 'serif')
plot(Default$balance, Default$income, pch = 16, col = alpha(cols[Default$default], 0.6), xlab = 'balance', ylab = 'income')
contour(x = balance_gr, y = income_gr, 
        z = matrix(p_gr, length(balance_gr), length(income_gr), byrow = F),
        levels = 0.5, lwd = 2, add = TRUE)

We can also calculate the accuracy.

p_hat <- as.numeric(predict(fit_glm, newdata = Default, type = "response"))
Y_hat <- ifelse(p_hat > 0.5, 1, 0)
conf_mat <- table(pred = Y_hat, true = Default$numeric_def)
conf_mat
    true
pred    0    1
   0 9629  225
   1   38  108
sum(diag(conf_mat))/sum(conf_mat) # in-sample accuracy
[1] 0.9737

That is a somewhat sad conclusion: we are not improving by much. But at least we now correctly identify some of the positive outcomes! Also remember that we can change the threshold value.

We show here how the true positives and true negatives evolve for different values of the threshold \(s\). The code is not displayed here; I leave it as an exercise (it will be requested in the next homework). Hint: you just need a for loop that, at every iteration, changes the value of the threshold \(s\) and computes the FPR, FNR, …

ROC (receiver operating characteristic) curve: it looks at the true positives and false positives for varying levels of \(s\).

We can use a package in order to compute the ROC curve. Try a simple Google search and you will find dozens of packages that do it. For example, let us try ROSE.

library(ROSE)
roc.curve(Default$default, p_hat)

Area under the curve (AUC): 0.949


LDA, QDA

Let us load data containing the characteristics of three species of flowers.

rm(list=ls())
cols <- brewer.pal(7, "Set2")

data(iris)
iris <- iris[,c(1:2, 5)]

n <- nrow(iris)

par(mar=c(4,4,2,2), family = 'serif')
plot(jitter(iris$Sepal.Length), jitter(iris$Sepal.Width), 
     col = cols[iris$Species], pch = 16, 
     xlab = 'Length', ylab = 'Width')

Bayes’ rule: \[\mathbb{P}(Y = k \mid \mathbf{x}) = \frac{\mathbb{P}(\mathbf{x} \mid Y = k) \mathbb{P}(Y = k)}{\mathbb{P}(\mathbf{x})}\]

\(\mathbb{P}(\mathbf{x})\) is the marginal probability of observing feature vector \(\mathbf{x}\). Notice that it does not depend on \(k\) (it is the same number for all classes). Thus, we usually write the posterior probabilities up to this constant of proportionality, without bothering to compute it: \[\mathbb{P}(Y = k \mid \mathbf{x}) \propto \mathbb{P}(\mathbf{x} \mid Y = k) \mathbb{P}(Y = k)\]

Note: often we do the actual computations on a log scale instead (for numerical stability).

The hard part is estimating the likelihood \(\mathbb{P}(\mathbf{x} \mid Y = k)\). In words: how likely is it that we would have observed feature vector \(\mathbf{x}\) if the class label were \(k\)? LDA and QDA model this class-conditional distribution as a multivariate normal, so that \[\mathbb{P}(Y = k \mid \mathbf{x}) \propto \pi_{k} \, N(\mathbf{x} \mid \boldsymbol{\mu}_{k}, \Sigma_{k})\]

We could estimate the LDA/QDA parameters by hand, without having to rely on implementations existing in R.

idx1 <- which(iris$Species == 'setosa') # indexes first class
idx2 <- which(iris$Species == 'versicolor') # indexes second class
idx3 <- which(iris$Species == 'virginica') # indexes third class

n1 <- sum(iris$Species == 'setosa') # sample size first class
n2 <- sum(iris$Species == 'versicolor') # sample size second class
n3 <- sum(iris$Species == 'virginica') # sample size third class

mu1 <- as.numeric(colMeans(iris[idx1,1:2])) # mean first class
mu2 <- as.numeric(colMeans(iris[idx2,1:2])) # mean second class
mu3 <- as.numeric(colMeans(iris[idx3,1:2])) # mean third class

Sigma1 <- as.matrix(cov(iris[idx1,1:2])) # covariance matrix first class
Sigma2 <- as.matrix(cov(iris[idx2,1:2])) # covariance matrix second class
Sigma3 <- as.matrix(cov(iris[idx3,1:2])) # covariance matrix third class

# Pooled estimate for the covariance (slide seen in class)
Sigma <- 1/(n - 3) * ((n1 - 1) * Sigma1 + (n2 - 1) * Sigma2 + (n3 - 1) * Sigma3)
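
With these estimates we could already classify by hand: evaluate \(\pi_k N(\mathbf{x} \mid \boldsymbol{\mu}_k, \Sigma)\) for each class and pick the largest. A minimal sketch (pred_hand is an illustrative name; the resulting confusion matrix should closely match the one obtained from lda() below):

# Unnormalized posteriors pi_k * N(x | mu_k, Sigma), with the pooled covariance
lik1 <- (n1 / n) * dmvnorm(iris[,1:2], mu1, Sigma)
lik2 <- (n2 / n) * dmvnorm(iris[,1:2], mu2, Sigma)
lik3 <- (n3 / n) * dmvnorm(iris[,1:2], mu3, Sigma)
# Predicted class = the one with the largest unnormalized posterior
pred_hand <- levels(iris$Species)[max.col(cbind(lik1, lik2, lik3))]
table(true = iris$Species, pred = pred_hand)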

Or we can simply use the R implementation!

lda_fit <- lda(iris[,1:2], iris$Species)
lda_fit
Call:
lda(iris[, 1:2], iris$Species)

Prior probabilities of groups:
    setosa versicolor  virginica 
 0.3333333  0.3333333  0.3333333 

Group means:
           Sepal.Length Sepal.Width
setosa            5.006       3.428
versicolor        5.936       2.770
virginica         6.588       2.974

Coefficients of linear discriminants:
                   LD1        LD2
Sepal.Length -2.141178 -0.8152721
Sepal.Width   2.768109 -2.0960764

Proportion of trace:
   LD1    LD2 
0.9628 0.0372 
lda_pred <- predict(lda_fit, iris[,1:2])

We can show the confusion matrix.

# Confusion matrix:
table(true = iris$Species, pred = lda_pred$class)
            pred
true         setosa versicolor virginica
  setosa         49          1         0
  versicolor      0         36        14
  virginica       0         15        35

And the decision boundaries.

x  <- seq(min(iris[,1]), max(iris[,1]), length.out = 100)
y  <- seq(min(iris[,2]), max(iris[,2]), length.out = 100)
xy <- expand.grid(Sepal.Length = x, Sepal.Width = y)

z  <- predict(lda_fit, xy)$post

z1 <- z[,1] - pmax(z[,2], z[,3]) # boundary of class 1
z2 <- z[,2] - pmax(z[,1], z[,3]) # boundary of class 2
z3 <- z[,3] - pmax(z[,1], z[,2]) # boundary of class 3

f1 <- matrix(dmvnorm(xy, mu1, Sigma), 100, 100)
f2 <- matrix(dmvnorm(xy, mu2, Sigma), 100, 100)
f3 <- matrix(dmvnorm(xy, mu3, Sigma), 100, 100)

par(mar=c(4,4,2,2), family = 'serif')
plot(jitter(iris$Sepal.Length), jitter(iris$Sepal.Width), 
     col = cols[iris$Species], pch = 16, 
     xlab = 'Length', ylab = 'Width')
contour(x, y, matrix(z1, 100, 100), levels = 0, lwd = 2, lty = 2, 
        drawlabels = F, add = T) 
contour(x, y, matrix(z2, 100, 100), levels = 0, lwd = 2, lty = 2, 
        drawlabels = F, add = T) 
# contour(x, y, matrix(z3, 100, 100), levels = 0, lwd = 2, lty = 2, 
#         drawlabels = F, add = T) 
contour(x, y, f1, col = cols[1], lwd = 2, levels = 0.3, drawlabels = F, add = T) 
contour(x, y, f2, col = cols[2], lwd = 2, levels = 0.3, drawlabels = F, add = T) 
contour(x, y, f3, col = cols[3], lwd = 2, levels = 0.3, drawlabels = F, add = T) 

How does QDA compare with LDA? Remember the fundamental difference: here we allow the covariance matrices to be different across the three classes, so we are estimating more parameters (the model is more flexible).

qda_fit <- qda(iris[,1:2], iris$Species)
qda_fit
Call:
qda(iris[, 1:2], iris$Species)

Prior probabilities of groups:
    setosa versicolor  virginica 
 0.3333333  0.3333333  0.3333333 

Group means:
           Sepal.Length Sepal.Width
setosa            5.006       3.428
versicolor        5.936       2.770
virginica         6.588       2.974
qda_pred <- predict(qda_fit, iris[,1:2])
# Confusion matrix:
table(true = iris$Species, pred = qda_pred$class)
            pred
true         setosa versicolor virginica
  setosa         49          1         0
  versicolor      0         37        13
  virginica       0         16        34
z  <- predict(qda_fit, xy)$post

z1 <- z[,1] - pmax(z[,2], z[,3]) # boundary of class 1
z2 <- z[,2] - pmax(z[,1], z[,3]) # boundary of class 2
z3 <- z[,3] - pmax(z[,1], z[,2]) # boundary of class 3

f1 <- matrix(dmvnorm(xy, mu1, Sigma1), 100, 100)
f2 <- matrix(dmvnorm(xy, mu2, Sigma2), 100, 100)
f3 <- matrix(dmvnorm(xy, mu3, Sigma3), 100, 100)

par(mar=c(4,4,2,2), family = 'serif')
plot(jitter(iris$Sepal.Length), jitter(iris$Sepal.Width), 
     col = cols[iris$Species], pch = 16, 
     xlab = 'Length', ylab = 'Width')
contour(x, y, matrix(z1, 100, 100), levels = 0, lwd = 2, lty = 2, 
        drawlabels = F, add = T) 
contour(x, y, matrix(z2, 100, 100), levels = 0, lwd = 2, lty = 2, 
        drawlabels = F, add = T)
# contour(x, y, matrix(z3, 100, 100), levels = 0, lwd = 2, lty = 2, 
#         drawlabels = F, add = T) 
contour(x, y, f1, col = cols[1], lwd = 2, levels = 0.3, drawlabels = F, add = T) 
contour(x, y, f2, col = cols[2], lwd = 2, levels = 0.3, drawlabels = F, add = T) 
contour(x, y, f3, col = cols[3], lwd = 2, levels = 0.3, drawlabels = F, add = T) 

See how the decision boundaries changed?


Naive Bayes

Naive Bayes is very useful for high-dimensional applications. Why? If we have \(p\) features, LDA needs to estimate a huge \(p \times p\) covariance matrix (and QDA one per class) for the normal distribution of \(X \mid Y = k\).

Recall that \(\mathbf{x} = (x_1,x_2, \dots, x_p)\) is a vector of \(p\) features. Here our strategy for estimating \(\mathbb{P}(\mathbf{x} \mid Y = k)\) is “Naive Bayes”. It is “naive” because we make the simplifying assumption that every feature \(x_j\) is independent of all other features: \[\mathbb{P}(\mathbf{x} \mid Y = k) = \prod_{j=1}^p \mathbb{P}(x_j \mid Y = k)\]

This simplifies the requirements of the problem: just estimate the marginal distribution of the features, i.e. \[\mathbb{P}(x_j \mid Y = k)\] for all features \(j\) and classes \(k\).

In this case we can choose \[\mathbb{P}(x_j \mid Y = k) = N(x_j \mid \mu_{kj}, \sigma_{kj}^{2})\]
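
To make the saving concrete, here is a rough count of the covariance parameters to estimate with \(K\) classes and \(p\) features: QDA needs \(K\,p(p+1)/2\), LDA needs \(p(p+1)/2\) (one pooled matrix), and Gaussian Naive Bayes only \(Kp\) variances. A small illustrative computation of these formulas:

# Number of covariance parameters to estimate (K = 3 classes)
p <- c(2, 10, 50, 100)
K <- 3
data.frame(p   = p,
           QDA = K * p * (p + 1) / 2,  # one full covariance matrix per class
           LDA = p * (p + 1) / 2,      # one pooled covariance matrix
           NB  = K * p)                # one variance per feature and class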

We can first estimate the parameters under the three classes.

idx1 <- which(iris$Species == 'setosa') # indexes first class
idx2 <- which(iris$Species == 'versicolor') # indexes second class
idx3 <- which(iris$Species == 'virginica') # indexes third class

pi1 <- sum(iris$Species == 'setosa')/n # proportion first class
pi2 <- sum(iris$Species == 'versicolor')/n # proportion second class
pi3 <- sum(iris$Species == 'virginica')/n # proportion third class

mu1 <- as.numeric(colMeans(iris[idx1,1:2])) # mean first class
mu2 <- as.numeric(colMeans(iris[idx2,1:2])) # mean second class
mu3 <- as.numeric(colMeans(iris[idx3,1:2])) # mean third class

sigma1 <- diag(apply(iris[idx1,1:2], 2, var)) # diagonal covariance matrix first class
sigma2 <- diag(apply(iris[idx2,1:2], 2, var)) # diagonal covariance matrix second class
sigma3 <- diag(apply(iris[idx3,1:2], 2, var)) # diagonal covariance matrix third class

We can then use the dmvnorm function to evaluate the log posteriors. Remember that \[\log (\text{posterior}) \propto \log (\text{likelihood}) + \log (\text{prior})\]

post1_prop <- as.numeric(dmvnorm(iris[,1:2], mean = mu1, sigma = sigma1, log = T)) + log(pi1)
post2_prop <- as.numeric(dmvnorm(iris[,1:2], mean = mu2, sigma = sigma2, log = T)) + log(pi2)
post3_prop <- as.numeric(dmvnorm(iris[,1:2], mean = mu3, sigma = sigma3, log = T)) + log(pi3)

Since these are unnormalized log posteriors, we need to exponentiate and normalize them to get actual posterior probabilities.

post1_NB <- exp(post1_prop) / (exp(post1_prop) + exp(post2_prop) + exp(post3_prop))
post2_NB <- exp(post2_prop) / (exp(post1_prop) + exp(post2_prop) + exp(post3_prop))
post3_NB <- exp(post3_prop) / (exp(post1_prop) + exp(post2_prop) + exp(post3_prop))
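
Since the log posteriors are large negative numbers, exponentiating them directly can underflow. A numerically safer alternative (a log-sum-exp sketch; post_NB is an illustrative name and gives the same probabilities up to floating-point error) subtracts the row-wise maximum before exponentiating.

# Log-sum-exp trick: subtract the row maximum before exponentiating
log_post <- cbind(post1_prop, post2_prop, post3_prop)
log_post <- log_post - apply(log_post, 1, max)       # stabilize each row
post_NB  <- exp(log_post) / rowSums(exp(log_post))   # normalized posterior probabilities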

We can use these probabilities to make predictions (more on this in the next homework).


k-Nearest Neighbours

A simple example of k-nearest neighbours on the Caravan data.

library(class) # to load knn

data(Caravan)
dim(Caravan)
[1] 5822   86
head(Caravan)
  MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE MRELGE
1      33        1       3        2        8      0      5      1      3      7
2      37        1       2        2        8      1      4      1      4      6
3      37        1       2        2        8      0      4      2      4      3
4       9        1       3        3        3      2      3      2      4      5
5      40        1       4        2       10      1      4      1      4      7
6      23        1       2        1        5      0      5      0      5      0
  MRELSA MRELOV MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD MOPLLAAG MBERHOOG
1      0      2        1        2        6        1        2        7        1
2      2      2        0        4        5        0        5        4        0
3      2      4        4        4        2        0        5        4        0
4      2      2        2        3        4        3        4        2        4
5      1      2        2        4        4        5        4        0        0
6      6      3        3        5        2        0        5        4        2
  MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO MSKA MSKB1 MSKB2 MSKC MSKD
1        0        1        2        5        2    1     1     2    6    1
2        0        0        5        0        4    0     2     3    5    0
3        0        0        7        0        2    0     5     0    4    0
4        0        0        3        1        2    3     2     1    4    0
5        5        4        0        0        0    9     0     0    0    0
6        0        0        4        2        2    2     2     2    4    2
  MHHUUR MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS MZPART MINKM30 MINK3045 MINK4575
1      1      8     8     0     1       8      1       0        4        5
2      2      7     7     1     2       6      3       2        0        5
3      7      2     7     0     2       9      0       4        5        0
4      5      4     9     0     0       7      2       1        5        3
5      4      5     6     2     1       5      4       0        0        9
6      9      0     5     3     3       9      0       5        2        3
  MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART PWABEDR PWALAND PPERSAUT PBESAUT
1        0        0       4        3       0       0       0        6       0
2        2        0       5        4       2       0       0        0       0
3        0        0       3        4       2       0       0        6       0
4        0        0       4        4       0       0       0        6       0
5        0        0       6        3       0       0       0        0       0
6        0        0       3        3       0       0       0        6       0
  PMOTSCO PVRAAUT PAANHANG PTRACTOR PWERKT PBROM PLEVEN PPERSONG PGEZONG
1       0       0        0        0      0     0      0        0       0
2       0       0        0        0      0     0      0        0       0
3       0       0        0        0      0     0      0        0       0
4       0       0        0        0      0     0      0        0       0
5       0       0        0        0      0     0      0        0       0
6       0       0        0        0      0     0      0        0       0
  PWAOREG PBRAND PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND AWAPART AWABEDR
1       0      5       0        0      0       0        0       0       0
2       0      2       0        0      0       0        0       2       0
3       0      2       0        0      0       0        0       1       0
4       0      2       0        0      0       0        0       0       0
5       0      6       0        0      0       0        0       0       0
6       0      0       0        0      0       0        0       0       0
  AWALAND APERSAUT ABESAUT AMOTSCO AVRAAUT AAANHANG ATRACTOR AWERKT ABROM
1       0        1       0       0       0        0        0      0     0
2       0        0       0       0       0        0        0      0     0
3       0        1       0       0       0        0        0      0     0
4       0        1       0       0       0        0        0      0     0
5       0        0       0       0       0        0        0      0     0
6       0        1       0       0       0        0        0      0     0
  ALEVEN APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED
1      0        0       0       0      1       0        0      0       0
2      0        0       0       0      1       0        0      0       0
3      0        0       0       0      1       0        0      0       0
4      0        0       0       0      1       0        0      0       0
5      0        0       0       0      1       0        0      0       0
6      0        0       0       0      0       0        0      0       0
  ABYSTAND Purchase
1        0       No
2        0       No
3        0       No
4        0       No
5        0       No
6        0       No
summary(Caravan$Purchase)
  No  Yes 
5474  348 
max(table(Caravan$Purchase))/sum(table(Caravan$Purchase)) # this is the baseline performance
[1] 0.9402267

We now split the observations into a test set, containing the first \(1000\) observations, and a training set, containing the remaining observations. We fit a KNN model on the training data using \(K = 1\), and evaluate its performance on the test data.

which(names(Caravan) == 'Purchase')
[1] 86
X <- scale(Caravan[,-which(names(Caravan) == 'Purchase')])

idx_test <- 1:1000
X_tr <- X[-idx_test,]
X_test <- X[idx_test,]
y_tr <- Caravan$Purchase[-idx_test]
y_test <- Caravan$Purchase[idx_test]

knn_pred <- knn(X_tr, X_test, y_tr, k = 1)
mean(y_test != knn_pred)
[1] 0.115
mean(y_test != "No")
[1] 0.059
table(knn_pred, y_test)
        y_test
knn_pred  No Yes
     No  875  49
     Yes  66  10
knn_pred <- knn(X_tr, X_test, y_tr, k = 3)
table(knn_pred, y_test)
        y_test
knn_pred  No Yes
     No  921  54
     Yes  20   5
knn_pred <- knn(X_tr, X_test, y_tr, k = 5)
table(knn_pred, y_test)
        y_test
knn_pred  No Yes
     No  930  55
     Yes  11   4
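
Raw accuracy is not very informative here, given how unbalanced the classes are. A more useful summary is the fraction of predicted buyers who actually purchase insurance. A short sketch that recomputes it for the three values of \(k\) (results can differ slightly from the tables above, because knn breaks ties at random):

set.seed(1)
for (k in c(1, 3, 5)) {
  pred <- knn(X_tr, X_test, y_tr, k = k)
  tab  <- table(pred, y_test)
  # fraction of predicted 'Yes' that are true 'Yes'
  cat("k =", k, "precision among predicted buyers:",
      round(tab["Yes", "Yes"] / sum(tab["Yes", ]), 3), "\n")
}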